#include "image.hh"
#include "matrix.hh"
#include "sizor.hh"
#include <iostream>

#include <cuda.h>

// NOTE(review): NB_BLOCK is not referenced anywhere in the visible part of
// this file (the kernel launch uses MPS instead) — confirm whether it is
// still needed.
#define NB_BLOCK 8

// 2D texture reference over unsigned char elements, read with
// cudaReadModeElementType (tex2D returns the stored byte, not a normalized
// float). main() binds it to a cudaArray with cudaBindTextureToArray before
// each kernel launch, and the convolution kernel samples it with tex2D.
texture <unsigned char, 2, cudaReadModeElementType> texRef;

// Shared-memory storage for the 9 convolution coefficients. The convolution
// kernel fills it from thread 0 and reads it as a row-major 3x3 matrix
// (index k*3+l).
__shared__ char m_convo[9];

/**
 * Applies a hard-coded 3x3 convolution (rows -1,-1,-1 / 0,0,0 / 1,1,1) to the
 * image currently bound to texRef and writes the result to `out`.
 *
 * Launch layout: 1D grid of 1D blocks. The global thread index
 * (blockIdx.x*blockDim.x + threadIdx.x) selects the output row handled by the
 * thread; each thread walks every output column of its row. Threads whose row
 * lies past the last valid output row do nothing.
 *
 * Shared memory: the file-scope m_convo[9] array, filled by thread 0 of each
 * block before use.
 *
 * The texture is sampled byte-wise: the x coordinate is
 * pixelIndex*sizeof(pixel) + channelOffset with offsets 0..3 feeding the
 * r, g, b, a accumulators.
 * NOTE(review): this assumes pixel is four one-byte channels laid out as
 * r,g,b,a — confirm against the pixel declaration.
 *
 * @param out    Output buffer, (w-2) x (h-2) pixels stored row-major with a
 *               row length of w-2 pixels.
 * @param w      Width of the input chunk, in pixels.
 * @param h      Height of the input chunk, in pixels.
 * @param matrix Currently unused: the coefficients are hard-coded below.
 */
__global__ void convolution(pixel* out,int w, int h, char* matrix){
    (void)matrix; // not read yet, see the TMP section below

    // One thread per block publishes the coefficients; the barrier makes
    // them visible to the whole block before anybody reads them.
    if(threadIdx.x==0){
        m_convo[0]=-1;
        m_convo[1]=-1;
        m_convo[2]=-1;
        m_convo[3]=0;
        m_convo[4]=0;
        m_convo[5]=0;
        m_convo[6]=1;
        m_convo[7]=1;
        m_convo[8]=1;
    }

    __syncthreads();

    //TMP: kernel geometry is fixed until `matrix` is actually used
    const int matrix_w = 3;
    const int matrix_h = 3;
    const int divider = 1;
    //TMP

    // Size of the valid output area (input minus the kernel border).
    const int out_w = w - 2*(matrix_w/2);
    const int out_h = h - 2*(matrix_h/2);

    // Output row handled by this thread.
    const unsigned int stride = (blockIdx.x*blockDim.x)+threadIdx.x;

    // Guard placed after the barrier so every thread still reaches
    // __syncthreads(). Without it, threads beyond the last output row wrote
    // outside of `out`, and a degenerate width made the unsigned column loop
    // run (almost) forever.
    if(out_w <= 0 || out_h <= 0 || stride >= (unsigned int)out_h)
        return;

    for(int x = 0; x < out_w; x++){
        float r = 0;
        float g = 0;
        float b = 0;
        float a = 0;

        for(int k = 0; k < matrix_h; k++)
            for(int l = 0; l < matrix_w; l++){
                const char coef = m_convo[k*matrix_w+l];
                r += tex2D(texRef, (x+l)*sizeof(pixel),   stride+k)*coef;
                g += tex2D(texRef, (x+l)*sizeof(pixel)+1, stride+k)*coef;
                b += tex2D(texRef, (x+l)*sizeof(pixel)+2, stride+k)*coef;
                a += tex2D(texRef, (x+l)*sizeof(pixel)+3, stride+k)*coef;
            }

        // 64-bit index so large chunks cannot overflow the row offset.
        pixel& dst = out[(size_t)stride*out_w + x];

        // (v > 0) zeroes negative responses before the store.
        // NOTE(review): there is no upper clamp; whether large sums fit
        // depends on the channel type of pixel — confirm.
        dst.r = (r > 0)*(r/divider);
        dst.g = (g > 0)*(g/divider);
        dst.b = (b > 0)*(b/divider);
        dst.a = (a > 0)*(a/divider);
    }
}

// Reports a failed CUDA runtime call on stdout (same format as the existing
// bind/unbind messages) and tells the caller whether the call succeeded.
static bool cuda_ok(cudaError_t err, const char* what){
  if(err != cudaSuccess){
    printf("error %s : %s\n", what, cudaGetErrorString(err));
    return false;
  }
  return true;
}

// Releases both sets of chunk buffers; entries that were never allocated
// (still null) are skipped.
static void release_buffers(pixel* plmem[2], cudaArray* gpmemIN[2], pixel* gpmemOUT[2]){
  for(unsigned int i = 0; i < 2; i++){
    // Pass the allocation itself, not the address of the pointer variable.
    if(plmem[i])    cuda_ok(cudaFreeHost(plmem[i]), "FreeHost");
    if(gpmemIN[i])  cuda_ok(cudaFreeArray(gpmemIN[i]), "FreeArray");
    if(gpmemOUT[i]) cuda_ok(cudaFree(gpmemOUT[i]), "Free");
  }
}

/**
 * Loads the image named by argv[1], runs the convolution kernel over it chunk
 * by chunk (chunks are produced by CUDA::Sizor) using two streams with one set
 * of buffers each, and saves the result to "imagesortie.bmp".
 *
 * Returns 0 on success, 1 when the input path is missing or when the CUDA
 * buffers/streams cannot be created.
 */
int main(int argc, char* argv[]){
  if(argc < 2){
    std::cout<<"usage: "<<(argc > 0 ? argv[0] : "convolution")<<" <input image>"<<std::endl;
    return 1;
  }

  // ==== Loading and allocations ================================== //
  std::cout<<"Loading...";
  Image input(argv[1]);
  std::cout<<"OK\n";

  // Convolution matrix: first row -1, middle row 0, last row 1.
  Matrix<char> matrix(3,3);
  for(unsigned int i = 0; i < 3; i++){
    for(unsigned int j = 0; j < 3; j++){
      if(i == 0)      matrix[i][j] = -1;
      else if(i == 2) matrix[i][j] =  1;
      else            matrix[i][j] =  0;
    }
  }

  // Output image, same dimensions as the input.
  Image output(input.width(),input.height());

  // Splits the image into chunks.
  CUDA::Sizor sizor(input,matrix);

  // One buffer set per stream: page-locked host staging memory, a cudaArray
  // for the texture input and linear device memory for the kernel output.
  pixel*     plmem[2]    = {0, 0};
  cudaArray* gpmemIN[2]  = {0, 0};
  pixel*     gpmemOUT[2] = {0, 0};

  // The array is declared as bytes, hence the width expressed in bytes.
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();

  bool ok = true;
  for(unsigned int i = 0; i < 2; i++){
    ok = cuda_ok(cudaMallocHost(&plmem[i],
                                (sizor.getMaxWidth(CUDA::IN)*sizor.getMaxHeight(CUDA::IN))*sizeof(pixel)),
                 "MallocHost") && ok;
    ok = cuda_ok(cudaMallocArray(&gpmemIN[i], &channelDesc,
                                 sizor.getMaxWidth(CUDA::IN)*sizeof(pixel),sizor.getMaxHeight(CUDA::IN)),
                 "MallocArray") && ok;
    ok = cuda_ok(cudaMalloc(&gpmemOUT[i],
                            (sizor.getMaxWidth(CUDA::OUT)*sizor.getMaxHeight(CUDA::OUT))*sizeof(pixel)),
                 "Malloc") && ok;
  }
  if(!ok){
    release_buffers(plmem, gpmemIN, gpmemOUT);
    return 1;
  }

  // ==== Per-stream processing ===================================== //
  cudaStream_t stream[2];
  unsigned int nb_streams = 0;
  for(; nb_streams < 2; nb_streams++)
    if(!cuda_ok(cudaStreamCreate(&stream[nb_streams]), "StreamCreate"))
      break;
  if(nb_streams < 2){
    for(unsigned int i = 0; i < nb_streams; i++)
      cudaStreamDestroy(stream[i]);
    release_buffers(plmem, gpmemIN, gpmemOUT);
    return 1;
  }

  cudaError_t err;
  int chunk = 0;
  int block_size = (sizor.getMaxWidth(CUDA::IN)*sizor.getMaxHeight(CUDA::IN));

  while(sizor.has_next()){
    const int buf = chunk%2;

    // This buffer set was last used two chunks ago: wait for that work before
    // the staging memory is overwritten. The other stream keeps running.
    cuda_ok(cudaStreamSynchronize(stream[buf]), "StreamSynchronize");

    {
      // Stage the chunk row by row into the page-locked buffer.
      unsigned int i = 0;
      for(; i < sizor.getCurrentHeight(CUDA::IN); i++){
        cuda_ok(cudaMemcpyAsync(&plmem[buf][i*sizor.getMaxWidth(CUDA::IN)],
                                &input[sizor.current().byi+i][sizor.current().bxi],
                                sizor.getCurrentWidth(CUDA::IN)*sizeof(pixel),
                                cudaMemcpyHostToHost, stream[buf]),
                "Memcpy plmem");
      }
      std::cout<<"cpy part1 OK"<<std::endl;
      // Chunks shorter than the maximum are padded by repeating their last row.
      for(; i < sizor.getMaxHeight(CUDA::IN); i++){
        cuda_ok(cudaMemcpyAsync(&plmem[buf][i*sizor.getMaxWidth(CUDA::IN)],
                                &input[sizor.current().byi+sizor.getCurrentHeight(CUDA::IN)-1][sizor.current().bxi],
                                sizor.getCurrentWidth(CUDA::IN)*sizeof(pixel),
                                cudaMemcpyHostToHost, stream[buf]),
                "Memcpy plmem");
      }
    }
    std::cout<<"Copy plmem OK"<<chunk<<std::endl;

    cuda_ok(cudaMemcpyToArrayAsync(gpmemIN[buf], 0, 0, plmem[buf],
                                   block_size*sizeof(pixel), cudaMemcpyHostToDevice,
                                   stream[buf]),
            "MemcpyToArray");

    std::cout<<"Copy to Array OK"<<chunk<<std::endl;

    // Bind the input array to the texture sampled by the kernel.
    // NOTE(review): texRef is shared by both streams and is re-bound while the
    // previous chunk's kernel may still be running — confirm this is safe.
    err = cudaBindTextureToArray(texRef, gpmemIN[buf], channelDesc);
    cuda_ok(err, "Bind");

    // Kernel. The last argument is never read by the kernel, which uses
    // hard-coded coefficients instead.
    convolution<<<MPS,32,0,stream[buf]>>>(gpmemOUT[buf],sizor.getMaxWidth(CUDA::IN),sizor.getMaxHeight(CUDA::IN),matrix[0]);
    // Launches do not return a status: configuration errors show up here.
    cuda_ok(cudaGetLastError(), "kernel launch");

    // Unbind
    err = cudaUnbindTexture(texRef);
    cuda_ok(err, "unbind");

    std::cout<<"BIND OK"<<chunk<<std::endl;

    // Copy the valid part of the result back, row by row.
    for(unsigned int i=0; i<sizor.getCurrentHeight(CUDA::OUT);++i){
      cuda_ok(cudaMemcpyAsync(&output[sizor.current().byo+i][sizor.current().bxo],
                              &gpmemOUT[buf][i*sizor.getMaxWidth(CUDA::OUT)],
                              sizor.getCurrentWidth(CUDA::OUT)*sizeof(pixel),
                              cudaMemcpyDeviceToHost,stream[buf]),
              "Memcpy out");
    }
    std::cout<<"Copy out OK"<<std::endl;

    chunk++;
    // Move on to the next chunk.
    sizor.next();
  }

  // Everything queued must have completed before the output image is read
  // and before the buffers are released.
  for(unsigned int i = 0; i < 2; i++){
    cuda_ok(cudaStreamSynchronize(stream[i]), "StreamSynchronize");
    cuda_ok(cudaStreamDestroy(stream[i]), "StreamDestroy");
  }

  output.save("imagesortie.bmp");

  // Memory release
  release_buffers(plmem, gpmemIN, gpmemOUT);

  return 0;
}
